package crawler;

import dao.CrawlerService;
import dao.CrawlerServiceImpl;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.frontier.DocIDServer;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.parser.ParseData;
import edu.uci.ics.crawler4j.url.WebURL;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpResponse;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import util.DBConn;
import util.DateTimeUtil;
import vo.Urls;

import java.sql.Connection;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;

/**
 * crawler4j crawler that walks the {@code http://cd.58.com/cangkucf/} listing pages and
 * stores the detail-page links found on each of them through {@link CrawlerService}.
 *
 * <p>Only the listing root and its numbered {@code pnN/} pages are followed; the detail
 * links themselves are recorded but not visited by this crawler.
 */
public class MyCrawler extends WebCrawler {

    /** Listing root page, e.g. {@code http://cd.58.com/cangkucf/}. */
    private static final Pattern LIST_ROOT_PAGE =
            Pattern.compile("^http://cd\\.58\\.com/cangkucf/$");

    /** Numbered listing page, e.g. {@code http://cd.58.com/cangkucf/pn4/}. */
    private static final Pattern LIST_NUMBERED_PAGE =
            Pattern.compile("^http://cd\\.58\\.com/cangkucf/pn([0-9]+)/$");

    /** URLs ending in one of these static-resource extensions are never visited. */
    private static final Pattern FILTERS =
            Pattern.compile(".*(\\.(css|js|gif|jpg|png|mp3|mp4|zip|gz))$");

    /**
     * Holds the URL matching rules: decides whether a discovered link should be fetched.
     *
     * @param referringPage the page on which the link was found (not used by the rules)
     * @param url           the candidate link
     * @return {@code true} only when the lower-cased URL is not a filtered static resource
     *         and is either the listing root or one of its numbered pages
     */
    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        // Locale.ROOT keeps the lower-casing independent of the JVM default locale
        // (e.g. Turkish, where 'I' would otherwise become a dotless 'ı').
        String href = url.getURL().toLowerCase(Locale.ROOT);

        if (FILTERS.matcher(href).matches()) {
            return false;
        }
        return LIST_NUMBERED_PAGE.matcher(href).matches()
                || LIST_ROOT_PAGE.matcher(href).matches();
    }

    /**
     * Called when a page has been fetched and is ready to be processed. For HTML pages,
     * extracts the {@code href} of every anchor matching
     * {@code .house-list-wrap li .pic a} and persists them in a single
     * {@link CrawlerService#insertUrls} call. Non-HTML pages are ignored.
     *
     * @param page the fetched page
     */
    @Override
    public void visit(Page page) {
        if (!(page.getParseData() instanceof HtmlParseData)) {
            return;
        }
        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        Document document = Jsoup.parse(htmlParseData.getHtml());

        Elements anchors = document.select(".house-list-wrap li .pic a");
        List<Urls> found = new ArrayList<>();
        for (Element anchor : anchors) {
            String href = anchor.attr("href");
            if (StringUtils.isBlank(href)) {
                // An anchor without a usable href would only store an empty URL row.
                continue;
            }
            Urls record = new Urls();
            record.setCity("chengdu");
            record.setCreatetime(DateTimeUtil.getCurrentTimeStr());
            record.setUrl(href);
            record.setWeb("58同城");
            found.add(record);
        }

        if (found.isEmpty()) {
            // Nothing matched on this page; avoid a pointless persistence call.
            return;
        }
        CrawlerService service = new CrawlerServiceImpl();
        service.insertUrls(found);
    }

    /**
     * Retained entry point; intentionally does nothing. It previously held ad-hoc,
     * commented-out experiments with the listing-page regular expressions.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // no-op
    }
}